4 Main analysis (Exploratory Data Analysis)
4.1 Summary Measure of Health
We remove rows containing NAs and, for each state, average the values over its counties. The bar charts below show how the 50 states rank on each of the four measures:
# Keep only the county rows that have no missing value in any column, so the
# state-level averages below are computed from complete records.
summary_measure_df1 <- summary_measure_df1[complete.cases(summary_measure_df1), ]

# Average every summary measure over the counties of each state.
# The argument is spelled `na.rm`; the former `rm.na = TRUE` was silently
# swallowed by mean()'s `...` and had no effect. It is now applied to all four
# measures for consistency (a no-op after the complete.cases() filter above).
summary_measure_state <- summary_measure_df1 %>%
  group_by(CHSI_State_Abbr) %>%
  summarise(
    meanALE = mean(ALE, na.rm = TRUE),
    mAD = mean(All_Death, na.rm = TRUE),
    mHS = mean(Health_Status, na.rm = TRUE),
    mUD = mean(Unhealthy_Days, na.rm = TRUE)
  ) %>%
  # Expose the short summary columns under the descriptive names used by the
  # plots; the short names (mAD, mHS, mUD) are kept alongside them.
  mutate(
    meanAll_Death = mAD,
    meanHealth_Status = mHS,
    meanUnhealthy_Days = mUD
  )
# Average Life Expectancy (ALE): the average number of years that a baby born
# in 1990 is expected to live if current mortality trends continue to apply.
# One bar per state, states ordered along the x axis by their mean ALE.
ggplot(
  summary_measure_state,
  aes(x = reorder(CHSI_State_Abbr, meanALE), y = meanALE)
) +
  geom_col(fill = "rosybrown") +
  labs(
    x = "States",
    y = "Average Value",
    title = "Histogram Visualization for Average Life Expectancy"
  ) +
  theme(
    plot.title = element_text(size = 14),
    axis.title = element_text(size = 12),
    # Tilt the state abbreviations so all of them fit without overlapping.
    axis.text.x = element_text(angle = 45, hjust = 1, size = 9)
  )
# All_Death: mortality from any cause, i.e. the average annual rate of all
# causes of death. One bar per state, states ordered by their mean value.
ggplot(
  summary_measure_state,
  aes(x = reorder(CHSI_State_Abbr, meanAll_Death), y = meanAll_Death)
) +
  geom_col(fill = "rosybrown") +
  labs(
    x = "States",
    y = "Average Value",
    title = "Histogram Visualization for All Death"
  ) +
  theme(
    plot.title = element_text(size = 14),
    axis.title = element_text(size = 12),
    # Tilt the state abbreviations so all of them fit without overlapping.
    axis.text.x = element_text(angle = 45, hjust = 1, size = 9)
  )
# Health_Status: state-level mean of the self-rated health status measure.
# One bar per state, states ordered by their mean value.
ggplot(
  summary_measure_state,
  aes(x = reorder(CHSI_State_Abbr, meanHealth_Status), y = meanHealth_Status)
) +
  geom_col(fill = "rosybrown") +
  labs(
    x = "States",
    y = "Average Value",
    title = "Histogram Visualization for Self-rated Health Status"
  ) +
  theme(
    plot.title = element_text(size = 14),
    axis.title = element_text(size = 12),
    # Tilt the state abbreviations so all of them fit without overlapping.
    axis.text.x = element_text(angle = 45, hjust = 1, size = 9)
  )
# Unhealthy_Days: the average number of unhealthy days (mental or physical) in
# the past 30 days, as reported by adults age 18 and older.
# One bar per state, states ordered by their mean value.
ggplot(
  summary_measure_state,
  aes(x = reorder(CHSI_State_Abbr, meanUnhealthy_Days), y = meanUnhealthy_Days)
) +
  geom_col(fill = "rosybrown") +
  labs(
    x = "States",
    y = "Average Value",
    title = "Histogram Visualization for Unhealthy Days"
  ) +
  theme(
    plot.title = element_text(size = 14),
    axis.title = element_text(size = 12),
    # Tilt the state abbreviations so all of them fit without overlapping.
    axis.text.x = element_text(angle = 45, hjust = 1, size = 9)
  )
# Output the cleaned datafile
# write.csv(summary_measure_state, file = "summary_measure_state.csv", row.names = FALSE)
Main conclusions from the bar charts:
Washington, D.C. has the lowest ALE, while Hawaii has the highest. The variation in
`ALE` is quite small, as it only ranges from 72 years to 79.47 years. The plots for
`Unhealthy Days` and `All Death` give a consistent finding: Hawaii has the smallest value in both. West Virginia has the largest value of unhealthy days, and Mississippi has the largest value of all deaths. However, the plot for self-rated
`Health Status` shows an interesting result: states with a larger value of `Unhealthy Days` tend to have a higher rating for their `Health Status`.
4.3 Risk Factors
We studied the geographical patterns of the risk factors and the relationships between them.
For obesity, which is our primary risk factor here, we plotted a gray-scale map at the county level; major cities are marked on the plot with red crosses.
library(maps)
# Keep only the rows whose Obesity value is strictly positive; which() also
# drops rows where the comparison is NA.
# NOTE(review): presumably non-positive values are missing-data codes — confirm.
risk_factor <- risk_factor[which(risk_factor$Obesity > 0), ]
# Combine a state code and a county code into a single numeric FIPS code.
#
# Args:
#   state:  integer-valued state code(s); zero-padded to 2 digits.
#   county: integer-valued county code(s); zero-padded to 3 digits.
#
# Returns:
#   The numeric value of the concatenated 5-digit string, e.g. state 1 and
#   county 3 give "01003", i.e. 1003. Works elementwise on vectors.
toFIPS <- function(state, county) {
  state_part <- sprintf("%02d", state)
  county_part <- sprintf("%03d", county)
  as.numeric(paste0(state_part, county_part))
}
# Look up the ZCTA5 value(s) recorded for one state/county pair.
#
# Args:
#   state:  state code, compared against ct$STATE.
#   county: county code, compared against ct$COUNTY.
#   ct:     lookup table with columns STATE, COUNTY and ZCTA5.
#
# Returns:
#   The ZCTA5 entries of every row of `ct` that matches both codes, or the
#   string "-1" when no row matches.
toZIP <- function(state, county, ct) {
  # The comparison spans whole columns, so the elementwise `&` is required.
  # The scalar `&&` only inspected the first row in R < 4.3 and is an error
  # for length > 1 operands from R 4.3 on.
  matched <- which(ct$STATE == state & ct$COUNTY == county)
  if (length(matched) == 0) {
    return("-1")
  }
  ct[matched, "ZCTA5"]
}
# Per-county lookup table: FIPS code (`region`) and fill colour (`value`).
# The colour is a gray level proportional to the county's obesity value
# relative to the maximum; gray(0) is black and gray(1) is white, so counties
# with a higher obesity value are drawn lighter.
# Built in one vectorised step (toFIPS() and gray() both work on vectors)
# instead of filling the data frame row by row.
plot_df <- data.frame(
  region = toFIPS(
    risk_factor[["State_FIPS_Code"]],
    risk_factor[["County_FIPS_Code"]]
  ),
  value = gray(abs(risk_factor[["Obesity"]] / max(risk_factor[["Obesity"]]))),
  stringsAsFactors = FALSE
)

# maps::map() applies `col` to the polygons in the order of the map database,
# not in the row order of plot_df. Translate each polygon name to its FIPS code
# through the `county.fips` table shipped with maps, then pick the colour of
# the matching plot_df row. Polygons without data get NA and stay unfilled.
polygon_names <- maps::map("county", plot = FALSE, namesonly = TRUE)
polygon_fips <- county.fips$fips[match(polygon_names, county.fips$polyname)]
polygon_col <- plot_df$value[match(polygon_fips, plot_df$region)]

maps::map("county", fill = TRUE, col = polygon_col)

# Overlay city markers as red crosses (pch = 3); capitals = 2 restricts the
# markers to state capitals.
maps::map.cities(
  x = us.cities, country = "", label = NULL, minpop = 0,
  maxpop = Inf, capitals = 2, cex = 2, projection = FALSE,
  parameters = NULL, orientation = NULL, pch = 3, col = "red"
)
It can be observed that
- the Southern states tend to be more obese than other parts of the US.
- people in the major cities seem to be more obese.
We then study the relationship between obesity and the other risk factors.
# Add the `x` column of each per-factor table to `risk` as a new column.
# NOTE(review): assumes diabete, few_fruit and High_Blood_Pres have the same
# row order as `risk` — confirm.
risk$diabete <- diabete$x
risk$few_fruit <- few_fruit$x
risk$High_Blood_Pres <- High_Blood_Pres$x
# Theme for the Cleveland dot plot: black-and-white base at size 18, with
# vertical grid lines and y-axis ticks removed so that only the horizontal
# guide lines (one per category) remain.
theme_dotplot <- theme_bw(base_size = 18) +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.major.y = element_line(size = 0.5),
    axis.ticks.y = element_blank(),
    # Shrink the category labels and the x-axis title relative to the base size.
    axis.text.y = element_text(size = rel(0.75)),
    axis.title.x = element_text(size = rel(0.75))
  )
# Cleveland dot plot of five risk-factor indices per state abbreviation.
# Each layer maps a constant string to `colour`; scale_colour_manual() then
# turns those strings into the actual colours and the legend labels.
# NOTE(review): every layer reorders Abbr by its own variable; presumably only
# the first layer's ordering (by `x`) determines the axis order — confirm.
ggplot(data = risk) +
  geom_point(aes(x = x, y = fct_reorder(Abbr, x), colour = "green")) +
  geom_point(aes(x = no_ex, y = fct_reorder(Abbr, no_ex), colour = "red")) +
  geom_point(
    aes(x = few_fruit, y = fct_reorder(Abbr, few_fruit), colour = "blue")
  ) +
  geom_point(
    aes(x = diabete, y = fct_reorder(Abbr, diabete), colour = "orange")
  ) +
  geom_point(
    aes(
      x = High_Blood_Pres,
      y = fct_reorder(Abbr, High_Blood_Pres),
      colour = "purple"
    )
  ) +
  scale_colour_manual(
    name = "Variables",
    breaks = c("green", "red", "blue", "orange", "purple"),
    values = c(
      "green" = "green",
      "red" = "red",
      "blue" = "blue",
      "orange" = "orange",
      "purple" = "purple"
    ),
    labels = c(
      "green" = "Obesity Index",
      "red" = "No-excercise Index",
      "blue" = "Few Fruit Index",
      "orange" = "Diabete Index",
      "purple" = "High Blood Pressure Index"
    )
  ) +
  labs(x = "Index", y = "") +
  theme_dotplot
It can be seen that there exists a strong correlation among these health risk factors, and the Southern states ranked higher on this Cleveland plot.